{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# import conga package"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import conga: point path_to_conga at wherever you downloaded the repository.\n",
    "# this is the top-level folder of the repository (i.e., it should contain scripts/, conga/ and examples/)\n",
    "path_to_conga = '/home/pbradley/gitrepos/conga/'\n",
    "import os\n",
    "import sys\n",
    "sys.path.append(path_to_conga)\n",
    "import conga\n",
    "import numpy as np # not needed for tcrdist\n",
    "import pandas as pd # not needed for tcrdist\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# create tcrdist calculator object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create tcrdist calculator for human alpha-beta tcrs\n",
    "# other possible values for organism are\n",
    "# 'mouse', 'human_gd', 'mouse_gd', 'human_ig'\n",
    "organism = 'human'\n",
    "tcrdist = conga.tcrdist.tcr_distances.TcrDistCalculator(organism)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# do some calculations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "133.0"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# within conga the tcrs are stored as columns in the AnnData.obs array\n",
    "# for calculations they are extracted into lists of tuples [tcr1, tcr2, ...]\n",
    "# where each tcr tuple looks like ((va,ja,cdr3a,cdr3a_nucseq), (vb,jb,cdr3b,cdr3b_nucseq))\n",
    "# but for TCRdist we only need the V gene and CDR3, so we leave the J genes as None below and skip the nucleotide seqs\n",
    "tcr1 = (('TRAV1-1*01', None, 'CAVEALTGGGNKLTF'), ('TRBV5-6*01', None, 'CASSAYTSGPKEQYF'))\n",
    "tcr2 = (('TRAV1-1*01', None, 'CAVPGITGGGNKLTF'), ('TRBV5-4*01', None, 'CASSLEQGPLQYF'))\n",
    "\n",
    "tcrdist(tcr1,tcr2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "103.0"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# or we can do single-chain distances\n",
    "tcrb1 = ('TRBV5-6*01', None, 'CASSAYTSGPKEQYF')\n",
    "tcrb2 = ('TRBV5-4*01', None, 'CASSLEQGPLQYF')\n",
    "\n",
    "tcrdist.single_chain_distance(tcrb1,tcrb2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# how do we figure out which V genes are understood by TCRdist?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['TRAV1-1*01', 'TRAV1-1*02', 'TRAV1-2*01', 'TRAV1-2*02', 'TRAV10*01', 'TRAV11*01', 'TRAV12-1*01', 'TRAV12-1*02', 'TRAV12-2*01', 'TRAV12-2*02', 'TRAV12-2*03', 'TRAV12-3*01', 'TRAV12-3*02', 'TRAV13-1*01', 'TRAV13-1*02', 'TRAV13-1*03', 'TRAV13-2*01', 'TRAV13-2*02', 'TRAV14/DV4*01', 'TRAV14/DV4*02', 'TRAV14/DV4*03', 'TRAV14/DV4*04', 'TRAV16*01', 'TRAV17*01', 'TRAV18*01', 'TRAV19*01', 'TRAV2*01', 'TRAV2*02', 'TRAV20*01', 'TRAV20*02', 'TRAV20*03', 'TRAV20*04', 'TRAV21*01', 'TRAV21*02', 'TRAV22*01', 'TRAV23/DV6*01', 'TRAV23/DV6*02', 'TRAV23/DV6*03', 'TRAV23/DV6*04', 'TRAV24*01', 'TRAV24*02', 'TRAV25*01', 'TRAV26-1*01', 'TRAV26-1*02', 'TRAV26-1*03', 'TRAV26-2*01', 'TRAV26-2*02', 'TRAV27*01', 'TRAV27*02', 'TRAV27*03', 'TRAV29/DV5*01', 'TRAV29/DV5*02', 'TRAV3*01', 'TRAV30*01', 'TRAV30*02', 'TRAV30*03', 'TRAV30*04', 'TRAV34*01', 'TRAV35*01', 'TRAV35*02', 'TRAV36/DV7*01', 'TRAV36/DV7*02', 'TRAV36/DV7*03', 'TRAV36/DV7*04', 'TRAV38-1*01', 'TRAV38-1*02', 'TRAV38-1*03', 'TRAV38-1*04', 'TRAV38-2/DV8*01', 'TRAV39*01', 'TRAV4*01', 'TRAV40*01', 'TRAV41*01', 'TRAV5*01', 'TRAV6*01', 'TRAV6*02', 'TRAV6*03', 'TRAV6*04', 'TRAV6*05', 'TRAV6*06', 'TRAV7*01', 'TRAV8-1*01', 'TRAV8-1*02', 'TRAV8-2*01', 'TRAV8-2*02', 'TRAV8-3*01', 'TRAV8-3*02', 'TRAV8-3*03', 'TRAV8-4*01', 'TRAV8-4*02', 'TRAV8-4*03', 'TRAV8-4*04', 'TRAV8-4*05', 'TRAV8-4*06', 'TRAV8-4*07', 'TRAV8-6*01', 'TRAV8-6*02', 'TRAV8-7*01', 'TRAV9-1*01', 'TRAV9-2*01', 'TRAV9-2*02', 'TRAV9-2*03', 'TRAV9-2*04', 'TRAJ1*01', 'TRAJ10*01', 'TRAJ11*01', 'TRAJ12*01', 'TRAJ13*01', 'TRAJ13*02', 'TRAJ14*01', 'TRAJ15*01', 'TRAJ15*02', 'TRAJ16*01', 'TRAJ17*01', 'TRAJ18*01', 'TRAJ19*01', 'TRAJ2*01', 'TRAJ20*01', 'TRAJ21*01', 'TRAJ22*01', 'TRAJ23*01', 'TRAJ23*02', 'TRAJ24*01', 'TRAJ24*02', 'TRAJ25*01', 'TRAJ26*01', 'TRAJ27*01', 'TRAJ28*01', 'TRAJ29*01', 'TRAJ3*01', 'TRAJ30*01', 'TRAJ31*01', 'TRAJ32*01', 'TRAJ32*02', 'TRAJ33*01', 'TRAJ34*01', 'TRAJ35*01', 'TRAJ36*01', 'TRAJ37*01', 'TRAJ37*02', 'TRAJ38*01', 'TRAJ39*01', 'TRAJ4*01', 
'TRAJ40*01', 'TRAJ41*01', 'TRAJ42*01', 'TRAJ43*01', 'TRAJ44*01', 'TRAJ45*01', 'TRAJ46*01', 'TRAJ47*01', 'TRAJ47*02', 'TRAJ48*01', 'TRAJ49*01', 'TRAJ5*01', 'TRAJ50*01', 'TRAJ51*01', 'TRAJ52*01', 'TRAJ53*01', 'TRAJ54*01', 'TRAJ55*01', 'TRAJ56*01', 'TRAJ57*01', 'TRAJ58*01', 'TRAJ59*01', 'TRAJ6*01', 'TRAJ60*01', 'TRAJ61*01', 'TRAJ7*01', 'TRAJ8*01', 'TRAJ9*01', 'TRBV1*01', 'TRBV10-1*01', 'TRBV10-1*02', 'TRBV10-2*01', 'TRBV10-2*02', 'TRBV10-3*01', 'TRBV10-3*02', 'TRBV10-3*03', 'TRBV10-3*04', 'TRBV11-1*01', 'TRBV11-2*01', 'TRBV11-2*02', 'TRBV11-2*03', 'TRBV11-3*01', 'TRBV11-3*02', 'TRBV11-3*03', 'TRBV12-1*01', 'TRBV12-2*01', 'TRBV12-3*01', 'TRBV12-4*01', 'TRBV12-4*02', 'TRBV12-5*01', 'TRBV13*01', 'TRBV13*02', 'TRBV14*01', 'TRBV14*02', 'TRBV15*01', 'TRBV15*02', 'TRBV15*03', 'TRBV16*01', 'TRBV16*02', 'TRBV16*03', 'TRBV17*01', 'TRBV18*01', 'TRBV19*01', 'TRBV19*02', 'TRBV19*03', 'TRBV2*01', 'TRBV2*02', 'TRBV2*03', 'TRBV20-1*01', 'TRBV20-1*02', 'TRBV20-1*03', 'TRBV20-1*04', 'TRBV20-1*05', 'TRBV20-1*06', 'TRBV20-1*07', 'TRBV20/OR9-2*01', 'TRBV20/OR9-2*02', 'TRBV20/OR9-2*03', 'TRBV21-1*01', 'TRBV21/OR9-2*01', 'TRBV23-1*01', 'TRBV23/OR9-2*01', 'TRBV23/OR9-2*02', 'TRBV24-1*01', 'TRBV24/OR9-2*01', 'TRBV25-1*01', 'TRBV26*01', 'TRBV26/OR9-2*01', 'TRBV26/OR9-2*02', 'TRBV27*01', 'TRBV28*01', 'TRBV29-1*01', 'TRBV29-1*02', 'TRBV29-1*03', 'TRBV29/OR9-2*01', 'TRBV29/OR9-2*02', 'TRBV3-1*01', 'TRBV3-1*02', 'TRBV3-2*01', 'TRBV3-2*02', 'TRBV3-2*03', 'TRBV30*01', 'TRBV30*02', 'TRBV30*04', 'TRBV30*05', 'TRBV4-1*01', 'TRBV4-1*02', 'TRBV4-2*01', 'TRBV4-2*02', 'TRBV4-3*01', 'TRBV4-3*02', 'TRBV4-3*03', 'TRBV4-3*04', 'TRBV5-1*01', 'TRBV5-1*02', 'TRBV5-3*01', 'TRBV5-3*02', 'TRBV5-4*01', 'TRBV5-4*02', 'TRBV5-4*03', 'TRBV5-4*04', 'TRBV5-5*01', 'TRBV5-5*02', 'TRBV5-5*03', 'TRBV5-6*01', 'TRBV5-7*01', 'TRBV5-8*01', 'TRBV5-8*02', 'TRBV6-1*01', 'TRBV6-2*01', 'TRBV6-3*01', 'TRBV6-4*01', 'TRBV6-4*02', 'TRBV6-5*01', 'TRBV6-6*01', 'TRBV6-6*02', 'TRBV6-6*03', 'TRBV6-6*04', 'TRBV6-6*05', 'TRBV6-7*01', 
'TRBV6-8*01', 'TRBV6-9*01', 'TRBV7-1*01', 'TRBV7-2*01', 'TRBV7-2*02', 'TRBV7-2*03', 'TRBV7-2*04', 'TRBV7-3*01', 'TRBV7-3*02', 'TRBV7-3*03', 'TRBV7-3*04', 'TRBV7-3*05', 'TRBV7-4*01', 'TRBV7-6*01', 'TRBV7-6*02', 'TRBV7-7*01', 'TRBV7-7*02', 'TRBV7-8*01', 'TRBV7-8*02', 'TRBV7-8*03', 'TRBV7-9*01', 'TRBV7-9*02', 'TRBV7-9*03', 'TRBV7-9*04', 'TRBV7-9*05', 'TRBV7-9*06', 'TRBV7-9*07', 'TRBV9*01', 'TRBV9*02', 'TRBV9*03', 'TRBJ1-1*01', 'TRBJ1-2*01', 'TRBJ1-3*01', 'TRBJ1-4*01', 'TRBJ1-5*01', 'TRBJ1-6*01', 'TRBJ1-6*02', 'TRBJ2-1*01', 'TRBJ2-2*01', 'TRBJ2-2P*01', 'TRBJ2-3*01', 'TRBJ2-4*01', 'TRBJ2-5*01', 'TRBJ2-6*01', 'TRBJ2-7*01', 'TRBJ2-7*02', 'TRBD1*01', 'TRBD2*01', 'TRBD2*02'])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the information on genes and cdrs is stored in the all_genes dictionary.\n",
    "# for tcrdist the V genes need to be in this dictionary. Note that they\n",
    "# have allele information (eg '*01'). If 10x doesn't provide that\n",
    "# we can just append '*01' to the V gene.\n",
    "info = conga.tcrdist.all_genes.all_genes['human']\n",
    "info.keys()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'alseq', 'cdr_columns', 'cdrs', 'chain', 'count_rep', 'id', 'mm1_rep', 'nucseq', 'nucseq_offset', 'organism', 'protseq', 'region', 'rep']\n",
      "A V ['TSG......FNG', 'NVL....DGL', 'SRSKGY', 'CAVR.....']\n"
     ]
    }
   ],
   "source": [
    "# info maps gene names to objects holding per-gene annotations:\n",
    "# list every attribute of one entry, then show a few of them\n",
    "trav12_info = info['TRAV1-2*01']\n",
    "print(dir(trav12_info))\n",
    "print(trav12_info.chain, trav12_info.region, trav12_info.cdrs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>organism</th>\n",
       "      <th>chain</th>\n",
       "      <th>region</th>\n",
       "      <th>nucseq</th>\n",
       "      <th>frame</th>\n",
       "      <th>aligned_protseq</th>\n",
       "      <th>cdr_columns</th>\n",
       "      <th>cdrs</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TRAV1*01</td>\n",
       "      <td>mouse</td>\n",
       "      <td>A</td>\n",
       "      <td>V</td>\n",
       "      <td>ggacagggcgtggagcagcctgacaacttgatgtctgtagagggaa...</td>\n",
       "      <td>1</td>\n",
       "      <td>GQGVEQ.P.DNLMSVEGTFARVNCTYSTSG......FNGLSWYQQR...</td>\n",
       "      <td>28-39;57-66;82-88;106-111</td>\n",
       "      <td>TSG......FNG;VVL....DGL;SRSN.GY;CAVR..</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>TRAV1*02</td>\n",
       "      <td>mouse</td>\n",
       "      <td>A</td>\n",
       "      <td>V</td>\n",
       "      <td>ggacagggtgtggagcagcctgccaaattgatgtctgtggagggaa...</td>\n",
       "      <td>1</td>\n",
       "      <td>GQGVEQ.P.AKLMSVEGTFARVNCTYSTSG......FNGLSWYQQR...</td>\n",
       "      <td>28-39;57-66;82-88;106-111</td>\n",
       "      <td>TSG......FNG;VVL....DGL;SRSN.GY;CAVR..</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>TRAV10*01</td>\n",
       "      <td>mouse</td>\n",
       "      <td>A</td>\n",
       "      <td>V</td>\n",
       "      <td>ggagagaaggtcgagcaacacgagtctacactgagtgttcgagagg...</td>\n",
       "      <td>1</td>\n",
       "      <td>GEKVEQHE.STLSVREGDSAVINCTYTDTA......SSYFPWYKQE...</td>\n",
       "      <td>28-39;57-66;82-88;106-111</td>\n",
       "      <td>DTA......SSY;IRSN...VDR;DKKA.KR;CAAS..</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>TRAV10*02</td>\n",
       "      <td>mouse</td>\n",
       "      <td>A</td>\n",
       "      <td>V</td>\n",
       "      <td>ggagagaaggtcgagcaacatgagtctacactgagtgttcgagagg...</td>\n",
       "      <td>1</td>\n",
       "      <td>GEKVEQHE.STLSVREGDSAVINCTYTDTA......SSYFPWYKQE...</td>\n",
       "      <td>28-39;57-66;82-88;106-111</td>\n",
       "      <td>DTA......SSY;IRSN...VDR;DKKA.KR;CAAS..</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>TRAV10*03</td>\n",
       "      <td>mouse</td>\n",
       "      <td>A</td>\n",
       "      <td>V</td>\n",
       "      <td>ggagagaaggtcgagcaacacgagtctacacttagtgttcaagagg...</td>\n",
       "      <td>1</td>\n",
       "      <td>GEKVEQHE.STLSVQEGDSAVINCTYTDTA......SSYFPWYKQE...</td>\n",
       "      <td>28-39;57-66;82-88;106-111</td>\n",
       "      <td>DTA......SSY;IRSN...VDR;DKKA.KR;CAAA..</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          id organism chain region  \\\n",
       "0   TRAV1*01    mouse     A      V   \n",
       "1   TRAV1*02    mouse     A      V   \n",
       "2  TRAV10*01    mouse     A      V   \n",
       "3  TRAV10*02    mouse     A      V   \n",
       "4  TRAV10*03    mouse     A      V   \n",
       "\n",
       "                                              nucseq  frame  \\\n",
       "0  ggacagggcgtggagcagcctgacaacttgatgtctgtagagggaa...      1   \n",
       "1  ggacagggtgtggagcagcctgccaaattgatgtctgtggagggaa...      1   \n",
       "2  ggagagaaggtcgagcaacacgagtctacactgagtgttcgagagg...      1   \n",
       "3  ggagagaaggtcgagcaacatgagtctacactgagtgttcgagagg...      1   \n",
       "4  ggagagaaggtcgagcaacacgagtctacacttagtgttcaagagg...      1   \n",
       "\n",
       "                                     aligned_protseq  \\\n",
       "0  GQGVEQ.P.DNLMSVEGTFARVNCTYSTSG......FNGLSWYQQR...   \n",
       "1  GQGVEQ.P.AKLMSVEGTFARVNCTYSTSG......FNGLSWYQQR...   \n",
       "2  GEKVEQHE.STLSVREGDSAVINCTYTDTA......SSYFPWYKQE...   \n",
       "3  GEKVEQHE.STLSVREGDSAVINCTYTDTA......SSYFPWYKQE...   \n",
       "4  GEKVEQHE.STLSVQEGDSAVINCTYTDTA......SSYFPWYKQE...   \n",
       "\n",
       "                 cdr_columns                                    cdrs  \n",
       "0  28-39;57-66;82-88;106-111  TSG......FNG;VVL....DGL;SRSN.GY;CAVR..  \n",
       "1  28-39;57-66;82-88;106-111  TSG......FNG;VVL....DGL;SRSN.GY;CAVR..  \n",
       "2  28-39;57-66;82-88;106-111  DTA......SSY;IRSN...VDR;DKKA.KR;CAAS..  \n",
       "3  28-39;57-66;82-88;106-111  DTA......SSY;IRSN...VDR;DKKA.KR;CAAS..  \n",
       "4  28-39;57-66;82-88;106-111  DTA......SSY;IRSN...VDR;DKKA.KR;CAAA..  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the information in the all_genes dictionary is read from this tsv file.\n",
    "# os.path.join builds the right path whether or not path_to_conga\n",
    "# was set with a trailing slash\n",
    "tsvfile = os.path.join(path_to_conga, 'conga', 'tcrdist', 'db', 'combo_xcr.tsv')\n",
    "df = pd.read_csv(tsvfile, sep='\\t')\n",
    "df.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id                                                          TRAV1*01\n",
       "organism                                                       mouse\n",
       "chain                                                              A\n",
       "region                                                             V\n",
       "nucseq             ggacagggcgtggagcagcctgacaacttgatgtctgtagagggaa...\n",
       "frame                                                              1\n",
       "aligned_protseq    GQGVEQ.P.DNLMSVEGTFARVNCTYSTSG......FNGLSWYQQR...\n",
       "cdr_columns                                28-39;57-66;82-88;106-111\n",
       "cdrs                          TSG......FNG;VVL....DGL;SRSN.GY;CAVR..\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.iloc[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# compute a tcrdist matrix: python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "20"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# let's say we had a long list of tcrs\n",
    "\n",
    "tcrs = [(('TRAV1-1*01',\n",
    "   'TRAJ10*01',\n",
    "   'CAVEALTGGGNKLTF',\n",
    "   'tgcgctgtggaggcactcacgggaggaggaaacaaactcaccttt'),\n",
    "  ('TRBV5-6*01',\n",
    "   'TRBJ2-7*01',\n",
    "   'CASSAYTSGPKEQYF',\n",
    "   'tgtgccagcagcgcgtacactagcggacctaaagagcagtacttc')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ10*01',\n",
    "   'CAVPGITGGGNKLTF',\n",
    "   'tgcgctgtgcccggaatcacgggaggaggaaacaaactcaccttt'),\n",
    "  ('TRBV5-4*01',\n",
    "   'TRBJ2-5*01',\n",
    "   'CASSLEQGPLQYF',\n",
    "   'tgtgccagcagcctggagcagggaccccttcagtacttc')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ10*01',\n",
    "   'CAVRDLGLTGGGNKLTF',\n",
    "   'tgcgctgtgagagatctggggctcacgggaggaggaaacaaactcaccttt'),\n",
    "  ('TRBV9*01',\n",
    "   'TRBJ2-2*01',\n",
    "   'CASSVEKRGGAGELFF',\n",
    "   'tgtgccagcagcgtagagaagcgggggggtgccggggagctgtttttt')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ10*01',\n",
    "   'CAVRGTGGGNKLTF',\n",
    "   'tgtgctgtgagaggtacgggaggaggaaacaaactcaccttt'),\n",
    "  ('TRBV2*01',\n",
    "   'TRBJ2-7*01',\n",
    "   'CASSEAPTGWEQYF',\n",
    "   'tgtgccagcagtgaagccccgacagggtgggagcagtacttc')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ10*01',\n",
    "   'CAVRVGGGNKLTF',\n",
    "   'tgcgctgtgagggtgggaggaggaaacaaactcaccttt'),\n",
    "  ('TRBV11-2*01',\n",
    "   'TRBJ1-1*01',\n",
    "   'CASSLDDVGGGFMNTEAFF',\n",
    "   'tgtgccagcagcttagacgatgttggaggggggttcatgaacactgaagctttcttt')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ10*01',\n",
    "   'CAVSTGGGNKLTF',\n",
    "   'tgcgctgtgagcacgggaggaggaaacaaactcaccttt'),\n",
    "  ('TRBV20-1*01',\n",
    "   'TRBJ2-1*01',\n",
    "   'CSARARQGLFLNEQFF',\n",
    "   'tgcagtgctagagcgagacaggggcttttcttgaatgagcagttcttc')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ11*01',\n",
    "   'CAALGGYSTLTF',\n",
    "   'tgcgctgccctcggaggatacagcaccctcaccttt'),\n",
    "  ('TRBV4-1*01',\n",
    "   'TRBJ2-2*01',\n",
    "   'CASSHSPGLAGGHTGELFF',\n",
    "   'tgcgccagcagccactcgccgggactagcgggagggcacaccggggagctgtttttt')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ11*01',\n",
    "   'CAVRDKNSGYSTLTF',\n",
    "   'tgcgctgtgagagataagaattcaggatacagcaccctcaccttt'),\n",
    "  ('TRBV6-3*01',\n",
    "   'TRBJ1-2*01',\n",
    "   'CASTSADGYGYTF',\n",
    "   'tgtgccagcacctcggcggacgggtatggctacaccttc')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ11*01',\n",
    "   'CAVRSDSGYSTLTF',\n",
    "   'tgcgctgtgagaagcgattcaggatacagcaccctcaccttt'),\n",
    "  ('TRBV15*01',\n",
    "   'TRBJ2-7*01',\n",
    "   'CATTPEGAPYEQYF',\n",
    "   'tgtgccaccaccccggagggagcgccctacgagcagtacttc')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ12*01',\n",
    "   'CAAEGMDSSYKLIF',\n",
    "   'tgcgctgccgaagggatggatagcagctataaattgatcttc'),\n",
    "  ('TRBV5-6*01',\n",
    "   'TRBJ2-5*01',\n",
    "   'CASSLGTGTSGPQETQYF',\n",
    "   'tgtgccagcagcctggggacaggaacatcgggacctcaagagacccagtacttc')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ12*01',\n",
    "   'CAGIQDSSYKLIF',\n",
    "   'tgcgccgggatacaagatagcagctataaattgatcttc'),\n",
    "  ('TRBV20-1*01',\n",
    "   'TRBJ2-1*01',\n",
    "   'CSARAKRGLFNEQFF',\n",
    "   'tgcagtgctagagcaaagcggggcctcttcaatgagcagttcttc')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ12*01',\n",
    "   'CAKVDSSYKLIF',\n",
    "   'tgcgctaaagtggatagcagctataaattgatcttc'),\n",
    "  ('TRBV20-1*01',\n",
    "   'TRBJ1-1*01',\n",
    "   'CSARRPGQEVTEAFF',\n",
    "   'tgcagtgctaggaggccgggacaggaggtcactgaagctttcttt')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ12*01',\n",
    "   'CASPGGDSSYKLIF',\n",
    "   'tgcgcttccccggggggggatagcagctacaaattgatcttc'),\n",
    "  ('TRBV19*01',\n",
    "   'TRBJ2-2*01',\n",
    "   'CASSPSEANTGELFF',\n",
    "   'tgtgccagtagtccatcggaggcgaacaccggggagctgtttttt')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ12*01',\n",
    "   'CAVEKDSSYKLIF',\n",
    "   'tgcgctgtggaaaaggatagcagctataaattgatcttc'),\n",
    "  ('TRBV12-4*01',\n",
    "   'TRBJ2-7*01',\n",
    "   'CASKREDYEQYF',\n",
    "   'tgtgccagcaagagggaggactacgagcagtacttc')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ12*01',\n",
    "   'CAVLHSSYKLIF',\n",
    "   'tgcgctgtgctgcatagcagctataaattgatcttc'),\n",
    "  ('TRBV7-9*01',\n",
    "   'TRBJ2-7*01',\n",
    "   'CASSSPGTGTSYEQYF',\n",
    "   'tgtgccagcagctccccagggacagggacttcctacgagcagtacttc')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ12*01',\n",
    "   'CAVRDGKSGDSSYKLIF',\n",
    "   'tgcgctgtgagagatggaaagagcggggatagcagctataaattgatcttc'),\n",
    "  ('TRBV19*01',\n",
    "   'TRBJ2-5*01',\n",
    "   'CASSSGSGPSQETQYF',\n",
    "   'tgtgccagtagttcaggtagcgggccgtcccaagagacccagtacttc')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ12*01',\n",
    "   'CAVRDPVDSSYKLIF',\n",
    "   'tgcgctgtgagagatccggtggatagcagctataaattgatcttc'),\n",
    "  ('TRBV6-3*01',\n",
    "   'TRBJ2-5*01',\n",
    "   'CASSPEREGSEETQYF',\n",
    "   'tgtgccagcagtccggaacgggaggggagtgaagagacccagtacttc')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ12*01',\n",
    "   'CAVREDSSYKLIF',\n",
    "   'tgcgctgtgagagaggatagcagctataaattgatcttc'),\n",
    "  ('TRBV7-2*01',\n",
    "   'TRBJ2-2*01',\n",
    "   'CASSLGPQGTGELFF',\n",
    "   'tgtgccagcagcttaggcccccagggaaccggggagctgtttttt')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ12*01',\n",
    "   'CAVSRMDSSYKLIF',\n",
    "   'tgcgctgtgtcccggatggatagcagctataaattgatcttc'),\n",
    "  ('TRBV19*01',\n",
    "   'TRBJ2-7*01',\n",
    "   'CASSHLLSTVDYEQYF',\n",
    "   'tgtgccagtagtcacctcctatcgacagtggactacgagcagtacttc')),\n",
    " (('TRAV1-1*01',\n",
    "   'TRAJ13*01',\n",
    "   'CAASGGYQKVTF',\n",
    "   'tgcgctgcgtctgggggttaccagaaagttaccttt'),\n",
    "  ('TRBV28*01',\n",
    "   'TRBJ1-5*01',\n",
    "   'CASSSGADSLYQPQHF',\n",
    "   'tgtgccagcagcagcggggctgacagtttatatcagccccagcatttt'))]\n",
    "len(tcrs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[  0., 133., 174., 176., 249., 219., 295., 260., 217., 204., 243.,\n",
       "        267., 239., 225., 223., 272., 254., 229., 230., 246.],\n",
       "       [133.,   0., 189., 152., 243., 247., 295., 229., 219., 208., 268.,\n",
       "        277., 233., 216., 247., 281., 247., 241., 254., 256.],\n",
       "       [174., 189.,   0., 208., 254., 271., 291., 285., 251., 273., 283.,\n",
       "        298., 265., 277., 264., 277., 249., 223., 283., 306.],\n",
       "       [176., 152., 208.,   0., 216., 225., 267., 204., 181., 248., 249.,\n",
       "        258., 220., 193., 195., 256., 237., 208., 229., 241.],\n",
       "       [249., 243., 254., 216.,   0., 235., 253., 293., 272., 261., 289.,\n",
       "        259., 268., 259., 237., 292., 281., 228., 274., 273.],\n",
       "       [219., 247., 271., 225., 235.,   0., 274., 290., 254., 291., 138.,\n",
       "        165., 260., 239., 248., 281., 281., 253., 248., 236.],\n",
       "       [295., 295., 291., 267., 253., 274.,   0., 286., 241., 274., 283.,\n",
       "        274., 219., 279., 234., 315., 304., 236., 282., 239.],\n",
       "       [260., 229., 285., 204., 293., 290., 286.,   0., 173., 314., 293.,\n",
       "        278., 256., 246., 265., 262., 213., 263., 262., 259.],\n",
       "       [217., 219., 251., 181., 272., 254., 241., 173.,   0., 271., 233.,\n",
       "        272., 239., 183., 216., 284., 227., 230., 239., 259.],\n",
       "       [204., 208., 273., 248., 261., 291., 274., 314., 271.,   0., 243.,\n",
       "        279., 218., 225., 223., 203., 191., 217., 194., 246.],\n",
       "       [243., 268., 283., 249., 289., 138., 283., 293., 233., 243.,   0.,\n",
       "        135., 224., 173., 206., 269., 233., 187., 218., 239.],\n",
       "       [267., 277., 298., 258., 259., 165., 274., 278., 272., 279., 135.,\n",
       "          0., 236., 203., 200., 272., 260., 199., 248., 263.],\n",
       "       [239., 233., 265., 220., 268., 260., 219., 256., 239., 218., 224.,\n",
       "        236.,   0., 212., 216., 177., 178., 159., 144., 211.],\n",
       "       [225., 216., 277., 193., 259., 239., 279., 246., 183., 225., 173.,\n",
       "        203., 212.,   0., 179., 260., 216., 169., 182., 258.],\n",
       "       [223., 247., 264., 195., 237., 248., 234., 265., 216., 223., 206.,\n",
       "        200., 216., 179.,   0., 240., 217., 192., 192., 218.],\n",
       "       [272., 281., 277., 256., 292., 281., 315., 262., 284., 203., 269.,\n",
       "        272., 177., 260., 240.,   0., 151., 225., 180., 223.],\n",
       "       [254., 247., 249., 237., 281., 281., 304., 213., 227., 191., 233.,\n",
       "        260., 178., 216., 217., 151.,   0., 218., 187., 238.],\n",
       "       [229., 241., 223., 208., 228., 253., 236., 263., 230., 217., 187.,\n",
       "        199., 159., 169., 192., 225., 218.,   0., 207., 249.],\n",
       "       [230., 254., 283., 229., 274., 248., 282., 262., 239., 194., 218.,\n",
       "        248., 144., 182., 192., 180., 187., 207.,   0., 214.],\n",
       "       [246., 256., 306., 241., 273., 236., 239., 259., 259., 246., 239.,\n",
       "        263., 211., 258., 218., 223., 238., 249., 214.,   0.]])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# build the distance matrix in python: evaluate tcrdist for every ordered\n",
    "# pair of tcrs, then fold the flat list into a len(tcrs) x len(tcrs) array\n",
    "flat_dists = [tcrdist(t1,t2) for t1 in tcrs for t2 in tcrs]\n",
    "D = np.array(flat_dists).reshape((len(tcrs),len(tcrs)))\n",
    "D"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# compute a distance matrix using the (faster) C++ code "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "util.run_command: cmd= /home/pbradley/gitrepos/conga/tcrdist_cpp/bin/find_neighbors -f tmp_tcrdists5345_tcrs.tsv --only_tcrdists -d /home/pbradley/gitrepos/conga/tcrdist_cpp/db/tcrdist_info_human.txt -o tmp_tcrdists5345\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# or we can use this wrapper around the C++ implementation if we have a longer list\n",
    "# or really want it to be fast.\n",
    "# pass the same organism that the python TcrDistCalculator above was built with,\n",
    "# so the two matrices are computed for the same species\n",
    "D2 = conga.preprocess.calc_tcrdist_matrix_cpp(tcrs, organism)\n",
    "# this should be True; array_equal also requires the two shapes to match\n",
    "# rather than relying on broadcasting in an elementwise ==\n",
    "np.array_equal(D2, D)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
